# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column and every row when a DataFrame is displayed (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Read the user-profile data from the CSV file
data = pd.read_csv('user_profiles_for_ads.csv')
# Display the first 5 records (5 is also what head() returns by default)
data.head(5)
| | User ID | Age | Gender | Location | Language | Education Level | Likes and Reactions | Followed Accounts | Device Usage | Time Spent Online (hrs/weekday) | Time Spent Online (hrs/weekend) | Click-Through Rates (CTR) | Conversion Rates | Ad Interaction Time (sec) | Income Level | Top Interests |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25-34 | Female | Suburban | Hindi | Technical | 5640 | 190 | Mobile Only | 4.5 | 1.7 | 0.193 | 0.067 | 25 | 20k-40k | Digital Marketing |
| 1 | 2 | 65+ | Male | Urban | Hindi | PhD | 9501 | 375 | Tablet | 0.5 | 7.7 | 0.114 | 0.044 | 68 | 0-20k | Data Science |
| 2 | 3 | 45-54 | Female | Suburban | Spanish | Technical | 4775 | 187 | Mobile Only | 4.5 | 5.6 | 0.153 | 0.095 | 80 | 60k-80k | Fitness and Wellness |
| 3 | 4 | 35-44 | Female | Rural | Spanish | PhD | 9182 | 152 | Desktop Only | 3.1 | 4.2 | 0.093 | 0.061 | 65 | 100k+ | Gaming, DIY Crafts |
| 4 | 5 | 25-34 | Female | Urban | English | Technical | 6848 | 371 | Mobile Only | 2.0 | 3.8 | 0.175 | 0.022 | 99 | 20k-40k | Fitness and Wellness, Investing and Finance, G... |
# Count the missing entries in each column
data.isna().sum()
User ID 0 Age 0 Gender 0 Location 0 Language 0 Education Level 0 Likes and Reactions 0 Followed Accounts 0 Device Usage 0 Time Spent Online (hrs/weekday) 0 Time Spent Online (hrs/weekend) 0 Click-Through Rates (CTR) 0 Conversion Rates 0 Ad Interaction Time (sec) 0 Income Level 0 Top Interests 0 dtype: int64
# Setting the aesthetic style of the plots
sns.set_style("whitegrid")
# Creating a 2 x 2 grid of subplots, one panel per demographic variable
fig, axes = plt.subplots(2, 2, figsize = (18, 12))
fig.suptitle('Distribution of Key Demographic Variables')
# (column to count, whether to rotate the x tick labels), in row-major panel order
demographic_panels = [
    ('Age', True),
    ('Gender', False),
    ('Education Level', True),
    ('Income Level', True),
]
for ax, (column, rotate_ticks) in zip(axes.flat, demographic_panels):
    sns.countplot(ax = ax, x = column, data = data, palette = 'coolwarm')
    # Title every panel the same way so all four are labelled consistently
    ax.set_title(f'{column} Distribution')
    if rotate_ticks:
        ax.tick_params(axis = 'x', rotation = 45)
# rect = [left, bottom, right, top] in figure coordinates: keep the top 5% free
# for the suptitle and only a thin 3% margin at the bottom
plt.tight_layout(rect = [0, 0.03, 1, 0.95])
plt.show()
# Count plot of the 'Device Usage' categories on its own small figure
device_fig, device_ax = plt.subplots(figsize = (7.5, 3.5))
sns.countplot(ax = device_ax, x = 'Device Usage', data = data, palette = 'coolwarm')
device_ax.set_title('Device Usage Distribution')
plt.show()
# Creating a 3 x 2 grid of subplots for user online behaviour and ad interaction metrics
fig, axes = plt.subplots(3, 2, figsize = (18, 15))
fig.suptitle('User Online Behaviour and Ad Interaction Metrics')
# (column to plot, panel title, bar colour), in row-major panel order
behaviour_panels = [
    ('Time Spent Online (hrs/weekday)', 'Time Spent Online on Weekdays', 'skyblue'),
    ('Time Spent Online (hrs/weekend)', 'Time Spent Online on Weekend', 'orange'),
    ('Likes and Reactions', 'Likes and Reactions', 'green'),
    ('Click-Through Rates (CTR)', 'Click-Through Rates (CTR)', 'Red'),
    ('Conversion Rates', 'Conversion Rates', 'purple'),
    ('Ad Interaction Time (sec)', 'Ad Interaction Time (sec)', 'brown'),
]
for ax, (column, title, colour) in zip(axes.flat, behaviour_panels):
    # 20-bin histogram with a KDE curve overlaid
    sns.histplot(ax = ax, x = column, data = data, bins = 20, kde = True, color = colour)
    ax.set_title(title)
# rect = [left, bottom, right, top] in figure coordinates: keep the top 5% free
# for the suptitle and only a thin 3% margin at the bottom
plt.tight_layout(rect = [0, 0.03, 1, 0.95])
plt.show()
# Identify the most common interests among users
# Import Counter
from collections import Counter
# Each 'Top Interests' value is split on ', ' into a list of interests; explode()
# then flattens those lists to one interest per entry in a single pass (summing
# the lists would re-copy the growing list for every row). Missing values are
# dropped so they are not counted as an interest.
interests_list = data['Top Interests'].str.split(', ').explode().dropna().tolist()
# Counting the frequency of each interest
interests_counter = Counter(interests_list)
# Converting the counter object to a dataframe for easier plotting,
# most frequent interest first
interests_df = pd.DataFrame(
    interests_counter.items(),
    columns = ['Interest', 'Frequency'],
).sort_values(by = 'Frequency', ascending = False)
# Plotting the ten most common interests as horizontal bars
plt.figure(figsize = (7.5, 3.5))
sns.barplot(x = 'Frequency', y = 'Interest', data = interests_df.head(10), palette = 'coolwarm')
plt.title('Top 10 User Interests')
plt.xlabel('Frequency')
plt.ylabel('Interest')
plt.show()
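Segmentation can be based on criteria such as demographics (age, gender, income level), online behaviour (time spent online on weekdays and weekends, likes and reactions) and ad engagement (click-through rates).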
# Import required libraries for clustering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
# Selecting features for clustering
features = ['Age', 'Gender', 'Income Level', 'Time Spent Online (hrs/weekday)', 'Time Spent Online (hrs/weekend)',
            'Likes and Reactions', 'Click-Through Rates (CTR)']
# Separating the features we want to consider for clustering
X = data[features]
# Defining preprocessing for numerical and categorical features:
# numeric columns are standardized, categorical columns are one-hot encoded
numeric_features = ['Time Spent Online (hrs/weekday)', 'Time Spent Online (hrs/weekend)',
                    'Likes and Reactions', 'Click-Through Rates (CTR)']
numeric_transformer = StandardScaler()
categorical_features = ['Age', 'Gender', 'Income Level']
categorical_transformer = OneHotEncoder()
# Combining preprocessing steps
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])
# Creating a preprocessing and clustering pipeline.
# n_init is set explicitly: it keeps the 10 initialisations used so far, silences
# the FutureWarning about the default changing to 'auto' in scikit-learn 1.4, and
# makes the clustering independent of the installed scikit-learn version.
pipeline = Pipeline(steps = [('preprocessor', preprocessor),
                             ('cluster', KMeans(n_clusters = 5, n_init = 10, random_state = 42))])
pipeline.fit(X)
# Attach the cluster label assigned to each row as a new 'Cluster' column
cluster_labels = pipeline.named_steps['cluster'].labels_
data['Cluster'] = cluster_labels
C:\Users\Mahesh S Valanju\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
# Preview the first five rows, now including the 'Cluster' column
data.head(n = 5)
| | User ID | Age | Gender | Location | Language | Education Level | Likes and Reactions | Followed Accounts | Device Usage | Time Spent Online (hrs/weekday) | Time Spent Online (hrs/weekend) | Click-Through Rates (CTR) | Conversion Rates | Ad Interaction Time (sec) | Income Level | Top Interests | Cluster |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25-34 | Female | Suburban | Hindi | Technical | 5640 | 190 | Mobile Only | 4.5 | 1.7 | 0.193 | 0.067 | 25 | 20k-40k | Digital Marketing | 2 |
| 1 | 2 | 65+ | Male | Urban | Hindi | PhD | 9501 | 375 | Tablet | 0.5 | 7.7 | 0.114 | 0.044 | 68 | 0-20k | Data Science | 1 |
| 2 | 3 | 45-54 | Female | Suburban | Spanish | Technical | 4775 | 187 | Mobile Only | 4.5 | 5.6 | 0.153 | 0.095 | 80 | 60k-80k | Fitness and Wellness | 0 |
| 3 | 4 | 35-44 | Female | Rural | Spanish | PhD | 9182 | 152 | Desktop Only | 3.1 | 4.2 | 0.093 | 0.061 | 65 | 100k+ | Gaming, DIY Crafts | 3 |
| 4 | 5 | 25-34 | Female | Urban | English | Technical | 6848 | 371 | Mobile Only | 2.0 | 3.8 | 0.175 | 0.022 | 99 | 20k-40k | Fitness and Wellness, Investing and Finance, G... | 2 |
# Profile each cluster by the mean of every numerical feature and the
# most frequent value (mode) of every categorical feature
rows_by_cluster = data.groupby('Cluster')
# Per-cluster averages of the numerical features
cluster_means = rows_by_cluster[numeric_features].mean()
# Despite its name, cluster_means also receives one mode column per categorical feature
for feature in categorical_features:
    # mode() can return several tied values; the first one is kept
    mode_series = rows_by_cluster[feature].agg(lambda values: values.mode().iloc[0])
    cluster_means[feature] = mode_series
cluster_means
| Cluster | Time Spent Online (hrs/weekday) | Time Spent Online (hrs/weekend) | Likes and Reactions | Click-Through Rates (CTR) | Age | Gender | Income Level |
|---|---|---|---|---|---|---|---|
| 0 | 3.911111 | 5.212963 | 2409.620370 | 0.149588 | 25-34 | Female | 80k-100k |
| 1 | 1.559394 | 6.002424 | 5005.121212 | 0.179836 | 35-44 | Male | 80k-100k |
| 2 | 3.019737 | 2.584211 | 6861.587719 | 0.170614 | 25-34 | Male | 20k-40k |
| 3 | 3.080882 | 5.774510 | 7457.602941 | 0.067971 | 25-34 | Female | 100k+ |
| 4 | 1.809626 | 3.839572 | 3021.219251 | 0.056594 | 45-54 | Female | 0-20k |
""" Assign each cluster a name that reflects its most defining characteristics based on :
1. the mean values of numerical features and
2. most frequent categories for categorical features """
' Assign each cluster a name that reflects its most defining characteristics based on :\n1. the mean values of numerical features and \n2. most frequent categories for categorical features '
Cluster 0 - "Weekend Warriors" : Highest weekday and high weekend online activity, lowest likes and reactions, predominantly female, age group 25-34, income level 80k-100k
Cluster 1 - "Engaged Professionals" : Lowest weekday but highest weekend online activity, moderate likes and reactions, highest CTR, predominantly male, age group 35-44, income level 80k-100k
Cluster 2 - "Low-Key Users" : Moderate weekday and lowest weekend online activity, high likes and reactions, predominantly male, age group 25-34, income level 20k-40k
Cluster 3 - "Active Explorers" : Moderate weekday and high weekend online activity, highest likes and reactions, low CTR, predominantly female, age group 25-34, high income (100k+)
Cluster 4 - "Budget Browsers" : Low weekday and moderate weekend online activity, low likes and reactions, lowest CTR, predominantly female, age group 45-54, lowest income level (0-20k)
# Import required libraries
import numpy as np
# Preparing data for radar chart
features_to_plot = ['Time Spent Online (hrs/weekday)', 'Time Spent Online (hrs/weekend)',
                    'Likes and Reactions', 'Click-Through Rates (CTR)']
labels = np.array(features_to_plot)
# Creating a dataframe for the radar chart ('Cluster' becomes a regular column)
radar_df = cluster_means[features_to_plot].reset_index()
# Min-max normalize every feature to [0, 1] across clusters so that features
# on very different scales can share one radial axis
radar_df_normalized = radar_df.copy()
for feature in features_to_plot:
    feature_min = radar_df[feature].min()
    feature_range = radar_df[feature].max() - feature_min
    if feature_range:
        radar_df_normalized[feature] = (radar_df[feature] - feature_min) / feature_range
    else:
        # All clusters share the same value: avoid 0 / 0 and place them at 0
        radar_df_normalized[feature] = 0.0
# Adding a full circle for plotting: append a copy of the first row.
# iloc[[0]] (double brackets) selects a one-row DataFrame; passing the Series
# iloc[0] to concat would add it as a new column with NaN-filled rows instead.
radar_df_normalized = pd.concat([radar_df_normalized, radar_df_normalized.iloc[[0]]],
                                ignore_index = True)
# Assigning names to segments
segment_names = ['Weekend Warriors', 'Engaged Professionals', 'Low-Key Users', 'Active Explorers', 'Budget Browsers']
# Import required libraries
import plotly.graph_objects as go
fig = go.Figure()
# Axis labels with the first one repeated at the end, so each polygon closes
closed_labels = labels.tolist() + [labels[0]]
# Loop through each segment to add to the radar chart.
# segment_names[i] is paired with row i of radar_df_normalized.
for i, segment in enumerate(segment_names):
    # Normalized feature values of this segment, looked up once as a plain list
    # (list indexing also avoids positional [0] access on a label-indexed Series)
    segment_values = radar_df_normalized.iloc[i][features_to_plot].tolist()
    # Add the first value at the end to close the radar chart
    closed_values = segment_values + segment_values[:1]
    fig.add_trace(go.Scatterpolar(
        r = closed_values,
        theta = closed_labels,
        fill = 'toself',
        name = segment,
        # Adding hover text for each feature (one entry per plotted point)
        hoverinfo = 'text',
        text = [f'{label} : {value:.2f}' for label, value in zip(closed_labels, closed_values)]
    ))
# Update the layout to finalize the radar chart
fig.update_layout(
    polar = dict(
        radialaxis = dict(
            visible = True,
            # Values are min-max normalized, so the radial axis spans [0, 1]
            range = [0, 1]
        )),
    showlegend = True,
    title = 'User Segments Profile'
)
fig.show()